Exercise 9.7

In this problem, you will use support vector approaches in order to predict whether a given car gets high or low gas mileage based on the Auto data set.

# Load the ISLR package, which ships the Auto data set used throughout
# this exercise.
library(ISLR)
data(Auto)
# Inspect the marginal distribution of every variable before modelling;
# note mpg's median (22.75) is the threshold used in part (a).
summary(Auto)
##       mpg          cylinders      displacement     horsepower   
##  Min.   : 9.00   Min.   :3.000   Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:17.00   1st Qu.:4.000   1st Qu.:105.0   1st Qu.: 75.0  
##  Median :22.75   Median :4.000   Median :151.0   Median : 93.5  
##  Mean   :23.45   Mean   :5.472   Mean   :194.4   Mean   :104.5  
##  3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:275.8   3rd Qu.:126.0  
##  Max.   :46.60   Max.   :8.000   Max.   :455.0   Max.   :230.0  
##                                                                 
##      weight      acceleration        year           origin     
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   Min.   :1.000  
##  1st Qu.:2225   1st Qu.:13.78   1st Qu.:73.00   1st Qu.:1.000  
##  Median :2804   Median :15.50   Median :76.00   Median :1.000  
##  Mean   :2978   Mean   :15.54   Mean   :75.98   Mean   :1.577  
##  3rd Qu.:3615   3rd Qu.:17.02   3rd Qu.:79.00   3rd Qu.:2.000  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   Max.   :3.000  
##                                                                
##                  name    
##  amc matador       :  5  
##  ford pinto        :  5  
##  toyota corolla    :  5  
##  amc gremlin       :  4  
##  amc hornet        :  4  
##  chevrolet chevette:  4  
##  (Other)           :365

(a). Create a binary variable that takes on a 1 for cars with gas mileage above the median, and a 0 for cars with gas mileage below the median.

# Flag each car as 1 when its mpg exceeds the sample median and 0 otherwise,
# then store the indicator on Auto as a two-level factor for classification.
median_mpg <- median(Auto$mpg)
high_mileage <- as.numeric(Auto$mpg > median_mpg)
Auto$mpglevel <- as.factor(high_mileage)
Auto$mpglevel
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0
##  [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
##  [71] 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0
## [106] 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1
## [141] 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0
## [176] 1 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0
## [211] 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1
## [246] 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 0 0 0 1 1 0 0
## [281] 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## [386] 1 1 1 1 1 1 1
## Levels: 0 1

(b). Fit a support vector classifier to the data with various values of “cost”, in order to predict whether a car gets high or low gas mileage. Report the cross-validation errors associated with different values of this parameter.

# e1071's tune() performs ten-fold cross-validation by default, so we can
# search a grid of cost values directly.
set.seed(34567)
library(e1071)
# Compare support vector classifiers (linear kernel) across a range of
# values of the cost parameter.
tune.fit <- tune(
  svm, mpglevel ~ ., data = Auto, kernel = "linear",
  ranges = list(cost = c(0.5, 1, 5, 10, 25, 50, 100))
)
# summary() reports the cross-validation error for each candidate cost.
summary(tune.fit)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##     1
## 
## - best performance: 0.01532051 
## 
## - Detailed performance results:
##    cost      error dispersion
## 1   0.5 0.02288462 0.02226748
## 2   1.0 0.01532051 0.02158955
## 3   5.0 0.01538462 0.02162241
## 4  10.0 0.02051282 0.02356248
## 5  25.0 0.03314103 0.02101863
## 6  50.0 0.03564103 0.02445289
## 7 100.0 0.03564103 0.02445289
# cost = 1 yields the lowest cross-validation error rate.
# tune() also stores the winning model, retrievable via $best.model.
bestmod <- tune.fit$best.model
summary(bestmod)
## 
## Call:
## best.tune(method = svm, train.x = mpglevel ~ ., data = Auto, 
##     ranges = list(cost = c(0.5, 1, 5, 10, 25, 50, 100)), kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
##       gamma:  0.003205128 
## 
## Number of Support Vectors:  56
## 
##  ( 26 30 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

(c). Now repeat (b), this time using SVMs with radial and polynomial basis kernels, with different values of “gamma”, “degree”, and “cost”.

# Cross-validate a polynomial-kernel SVM over a grid of cost and degree
# values (ten-fold CV, as in part (b)).
tune.fit1 <- tune(svm, mpglevel ~ ., data = Auto, kernel = "polynomial", 
           ranges = list(cost = c(0.1, 1, 5, 10, 25, 50, 100), degree = c(2, 3, 4)))
summary(tune.fit1)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost degree
##   100      2
## 
## - best performance: 0.3038462 
## 
## - Detailed performance results:
##     cost degree     error dispersion
## 1    0.1      2 0.5716667 0.06669529
## 2    1.0      2 0.5716667 0.06669529
## 3    5.0      2 0.5716667 0.06669529
## 4   10.0      2 0.5280769 0.10077581
## 5   25.0      2 0.4548718 0.11439078
## 6   50.0      2 0.3523718 0.10155039
## 7  100.0      2 0.3038462 0.07176689
## 8    0.1      3 0.5641667 0.08389600
## 9    1.0      3 0.5641667 0.08389600
## 10   5.0      3 0.5641667 0.08389600
## 11  10.0      3 0.5641667 0.08389600
## 12  25.0      3 0.5641667 0.08389600
## 13  50.0      3 0.4872436 0.07985420
## 14 100.0      3 0.3396795 0.13294392
## 15   0.1      4 0.5741667 0.06192605
## 16   1.0      4 0.5741667 0.06192605
## 17   5.0      4 0.5741667 0.06192605
## 18  10.0      4 0.5741667 0.06192605
## 19  25.0      4 0.5741667 0.06192605
## 20  50.0      4 0.5741667 0.06192605
## 21 100.0      4 0.5741667 0.06192605
# lowest cross-validation error is obtained for cost = 100 and degree = 2
# Now cross-validate a radial-kernel SVM over a grid of cost and gamma.
tune.fit2 <- tune(svm, mpglevel ~ ., data = Auto, kernel = "radial", 
            ranges = list(cost = c(0.1, 1, 5, 10), 
                    gamma = c(0.01, 0.1, 1, 5, 10, 100)))

summary(tune.fit2)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost gamma
##    10  0.01
## 
## - best performance: 0.02814103 
## 
## - Detailed performance results:
##    cost gamma      error dispersion
## 1   0.1 1e-02 0.08929487 0.03470503
## 2   1.0 1e-02 0.07141026 0.03580472
## 3   5.0 1e-02 0.05108974 0.02709895
## 4  10.0 1e-02 0.02814103 0.02246020
## 5   0.1 1e-01 0.07653846 0.03199026
## 6   1.0 1e-01 0.05621795 0.03594072
## 7   5.0 1e-01 0.03564103 0.02142556
## 8  10.0 1e-01 0.03057692 0.02010376
## 9   0.1 1e+00 0.54826923 0.03453622
## 10  1.0 1e+00 0.05102564 0.03973118
## 11  5.0 1e+00 0.05865385 0.04942437
## 12 10.0 1e+00 0.05865385 0.04942437
## 13  0.1 5e+00 0.54826923 0.03453622
## 14  1.0 5e+00 0.51243590 0.06765660
## 15  5.0 5e+00 0.50487179 0.06329924
## 16 10.0 5e+00 0.50487179 0.06329924
## 17  0.1 1e+01 0.54826923 0.03453622
## 18  1.0 1e+01 0.52275641 0.06191809
## 19  5.0 1e+01 0.51762821 0.05645657
## 20 10.0 1e+01 0.51762821 0.05645657
## 21  0.1 1e+02 0.54826923 0.03453622
## 22  1.0 1e+02 0.54826923 0.03453622
## 23  5.0 1e+02 0.54826923 0.03453622
## 24 10.0 1e+02 0.54826923 0.03453622
#For radial lowest cross-validation error is obtained for cost = 10 and gamma = 0.01

(d). Make some plots to back up your assertions in (b) and (c).

# Refit one model per kernel at the best hyper-parameters selected by
# cross-validation in parts (b) and (c), then visualize each fit.
svm.linear <- svm(mpglevel ~ ., data = Auto, kernel = "linear", cost = 1)
svm.polynomial <- svm(mpglevel ~ ., data = Auto, kernel = "polynomial", cost = 100, degree = 2)
# Fix: part (c) found cost = 10 (not 100) optimal for the radial kernel,
# so refit with the tuned value alongside gamma = 0.01.
svm.radial <- svm(mpglevel ~ ., data = Auto, kernel = "radial", cost = 10, gamma = 0.01)

# Plot the fitted classifier against mpg paired with every other predictor.
# mpglevel (the response) and name (a factor label) are skipped as x-axes.
plotpairs <- function(fit) {
  for (name in names(Auto)[!(names(Auto) %in% c("mpg", "mpglevel", "name"))]) {
    # plot.svm takes a formula naming the two variables to display.
    plot(fit, Auto, as.formula(paste0("mpg ~ ", name)))
  }
}
plotpairs(svm.linear)

plotpairs(svm.polynomial)

plotpairs(svm.radial)